# -*- coding: utf-8 -*-
"""
This script generates the random seeds for the Air LDA models. Run it only
once: the seeds are random, so if the saved seed files are overwritten you
will not be able to replicate your earlier results.

@author: samir
"""

import pandas as pd
import topicmodels
import topicmodels.preprocess
import os
import numpy as np
from paths import target_path, air_LDA_path #importing target_path from paths.py


def main():
    """Announce on stdout that the Air LDA seed script is running."""
    banner = "Running: Air LDA Seeds"
    print(banner)


# Print the banner only when this file is executed directly as a script.
if __name__ == "__main__":
    main()

# Change to the relevant directory so the relative CSV path below resolves.
# A missing directory is only reported here; the script carries on.
if os.path.exists(target_path):
    os.chdir(target_path)
else:
    print(f"Warning: {target_path} does not exist!")


# Import data
dataframe = pd.read_csv("Air_Subset/Processed_Data/Air_Complaints_lemmatized.csv", encoding="utf-8")
# Replace missing values with empty strings before building the documents.
dataframe = dataframe.replace(np.nan, '', regex=True)
docsobj = topicmodels.RawDocs(dataframe.IncidentDescriptionLemma, "long")

# Preprocess lemmatized complaints
docsobj.token_clean(1)
docsobj.stopword_remove("tokens")

# Change directory so the LDA outputs are saved in the correct folder
os.chdir(air_LDA_path)
docsobj.term_rank("tokens")

# Perform tf-idf removal, using the value stored at position 6000 of the
# tf-idf ranking as the cutoff (presumably the tf-idf score -- TODO confirm).
docsobj.rank_remove("tfidf", "tokens", docsobj.tfidf_ranking[6000][1])

# Generate seeds: one independently initialised 10-topic sampler per output
# file. Paths are relative to air_LDA_path (the current directory).
NUM_TOPICS = 10
SEED_FILES = (
    '6000_tfidf_10_Topics/seed.csv',         # Seed 1
    '6000_tfidf_10_Topics_seed_2/seed.csv',  # Seed 2
    '6000_tfidf_10_Topics_seed_3/seed.csv',  # Seed 3
)

for seed_file in SEED_FILES:
    # Seeds are random, so an overwritten seed cannot be recovered. Never
    # replace one that is already on disk; re-running the script is harmless.
    if os.path.exists(seed_file):
        print(f"Skipping {seed_file}: seed already exists and will not be overwritten")
        continue

    # Make sure the output folder exists so np.savetxt does not fail.
    os.makedirs(os.path.dirname(seed_file), exist_ok=True)

    ldaobj = topicmodels.LDA.LDAGibbs(docsobj.tokens, NUM_TOPICS)
    np.savetxt(seed_file, ldaobj.topic_seed)